#include <lua.h>
#include <lualib.h>
#include <lauxlib.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <assert.h>
#include <stdarg.h>


/*
 * Pointer-width selection.
 *   CPU64 is defined when PTRDIFF_MAX exceeds UINT32_MAX, CPU32 otherwise.
 *   ltoint(p)  converts a pointer into the integer that is handed to
 *              lua_pushinteger().
 *   ltoptr(i)  converts an integer back into a pointer.
 */
#if PTRDIFF_MAX > UINT32_MAX
#define CPU64
#define ltoint(a) ((lua_Integer)(a))
#define ltoptr(a) ((void*)(a))
#else
#define CPU32
#define ltoint(a) ((int)(a))
/* Argument and expansion are fully parenthesised so that an expression such
   as ltoptr(x + 1) converts the whole expression, not just its first operand. */
#define ltoptr(a) ((void*)(intptr_t)(a))
#endif

/* lua_isinteger() was introduced in Lua 5.3.  For every older version
   (including 5.2) fall back to lua_isnumber(), which means any number is
   treated as an integer there. */
#if LUA_VERSION_NUM < 503
#define lua_isinteger lua_isnumber
#endif

#ifndef lcheck_toptr
#define lcheck_toptr _lcheck_toptr
/* Strict string test: true only when the value's type tag is LUA_TSTRING. */
#define lua_isstring(a,idx) ((lua_type(a,idx)==LUA_TSTRING)?1:0)
/*
 * Convert the Lua value at stack index idx into a raw C pointer.
 *   string          -> pointer to the string's bytes
 *   light userdata  -> the stored pointer
 *   full userdata   -> the pointer read from the start of the userdata block
 * Any other type raises the Lua error "type error:pointer_type".
 */
static void* _lcheck_toptr(lua_State*L,int idx){
  switch(lua_type(L,idx)){
    case LUA_TSTRING:
      return (void*)lua_tostring(L,idx);
    case LUA_TLIGHTUSERDATA:
      return (void*)lua_touserdata(L,idx);
    case LUA_TUSERDATA:
      return *(void**)lua_touserdata(L,idx);
    default:
      break;
  }
  lua_pushliteral(L,"type error:pointer_type");
  lua_error(L);
  return 0;
}
#endif

#ifdef   _WIN32
#include <windows.h>
   /*
    * Lua: dlopen(path) -> light userdata
    * Loads the library named by argument 1 and pushes the module handle as a
    * light userdata (a NULL handle when loading fails).
    * LoadLibraryA is called explicitly because the path is a narrow string;
    * the generic LoadLibrary macro selects the wide-character variant when
    * UNICODE is defined.
    */
   static int ldlopen(lua_State*L){
       const char *path = lcheck_toptr(L,1);
       void *handle = (void*)LoadLibraryA(path);
       lua_pushlightuserdata(L,handle);
       return 1;
   }
   
   /*
    * Lua: dlclose(handle) -> integer
    * Releases the module given as argument 1 and pushes FreeLibrary's
    * result, which is non-zero on success.
    */
   static int ldlclose(lua_State*L){
       void *handle = lcheck_toptr(L,1);
       int ok = FreeLibrary(handle);
       lua_pushinteger(L,ok);
       return 1;
   }
   
   /*
    * Lua: dlsym(handle, name) -> light userdata
    * Looks up the exported symbol `name` in the module `handle` and pushes
    * the address returned by GetProcAddress.
    */
   static int ldlsym(lua_State*L){
       void *module = lcheck_toptr(L,1);
       void *symbol = lcheck_toptr(L,2);
       lua_pushlightuserdata(L,(void*)GetProcAddress(module,symbol));
       return 1;
   }
#else
#include <dlfcn.h>
   /*
    * Lua: dlopen(path) -> light userdata
    * Opens the shared object named by argument 1 with RTLD_LAZY and pushes
    * the handle returned by dlopen() as a light userdata.
    */
   static int ldlopen(lua_State*L){
       const char *path = lcheck_toptr(L,1);
       lua_pushlightuserdata(L,dlopen(path,RTLD_LAZY));
       return 1;
   }
   
   /*
    * Lua: dlclose(handle) -> integer
    * Closes the handle given as argument 1 and pushes dlclose()'s return
    * value.  dlclose() returns 0 on success and non-zero on error, i.e. the
    * opposite convention from the FreeLibrary-based Windows build.
    */
   static int ldlclose(lua_State*L){
       void *handle = lcheck_toptr(L,1);
       int status = dlclose(handle);
       lua_pushinteger(L,status);
       return 1;
   }
   
   /*
    * Lua: dlsym(handle, name) -> light userdata
    * Looks up `name` in the shared object `handle` and pushes the address
    * returned by dlsym().
    */
   static int ldlsym(lua_State*L){
       void *handle = lcheck_toptr(L,1);
       const char *symbol = lcheck_toptr(L,2);
       lua_pushlightuserdata(L,dlsym(handle,symbol));
       return 1;
   }
#endif


/*
 * Scratch buffer in which the arguments of one foreign call are assembled.
 * It is filled like a descending stack: every push moves `top` towards
 * lower addresses, so the value pushed last sits at the lowest address.
 * getArgs() resets `size` to 0 and `top` to one past the end of `parm`.
 */
static struct Arg{
    int     size;         /* number of bytes pushed so far */
    uint8_t *top;         /* address of the most recently pushed value */
    uint8_t parm[0x100];  /* backing storage */
}arg;

/*
 * Push `size` bytes read from `data`.
 * No bounds check is made: the caller must make sure that `size` bytes are
 * still free below arg->top.
 */
static void wdata(struct Arg*arg,void* data,int size){
    uint8_t *dst = arg->top - size;

    memcpy(dst,data,size);
    arg->top = dst;
    arg->size += size;
}

/*
 * Push one pointer-sized value.
 * Same contract as wdata(): no bounds check.
 */
static void wptr(struct Arg*arg,void*prm){
    void **slot = (void**)(arg->top - sizeof(void*));

    *slot = prm;
    arg->top = (uint8_t*)slot;
    arg->size += (int)sizeof(void*);
}

#if 0
/*
 * Disabled debugging helper (compiled out by the surrounding #if 0).
 * Prints the va_list value, fetches two ints from it, prints them and then
 * prints the va_list value again.
 * NOTE(review): the "%f" conversion below is given int arguments.
 */
static void wtest(struct Arg*arg,va_list ap){
    printf("float start:%p\n",ap);
    //float v = va_arg(ap,float);
    //float v = va_arg(ap,double);
    int v = va_arg(ap,int);
    int v2 = va_arg(ap,int);
    printf("%f\n",v,v2);
    printf("float end  :%p\n",ap);
}
#endif

/*
 * Push one floating-point argument.
 *
 * The value travels through the `...` and therefore arrives as a double
 * (float arguments are promoted).  Its bytes are pushed as pointer-sized
 * words so that, read upwards from arg->top, they appear in memory order:
 * two words where pointers are 4 bytes, one word where they are 8 bytes.
 *
 * The value is fetched with va_arg(ap,double).  Fetching it as two ints is
 * undefined behaviour and only happens to work on ABIs that pass a double
 * as a pair of int-sized stack slots.
 *
 * Like wptr(), this performs no bounds check on the buffer.
 */
static void wfloat(struct Arg*arg,...){
    enum { NWORDS = (sizeof(double) + sizeof(void*) - 1) / sizeof(void*) };
    void *words[NWORDS] = {0};
    va_list ap;
    double value;
    int i;

    va_start(ap,arg);
    value = va_arg(ap,double);
    va_end(ap);

    memcpy(words,&value,sizeof value);
    /* push the highest word first so that the lowest one ends up at arg->top */
    for(i = NWORDS; i-- > 0; ){
        wptr(arg,words[i]);
    }
}

/* doubles take exactly the same path as floats */
#define wdouble wfloat

/*
#if defined(CPU32)
static void wint(struct Arg*arg,int v){
    uint8_t *p = arg->parm;
    p += arg->size;
    (*(int)p) = v;
    arg->size += 4;
}
#endif
*/

/*
 * Statement macros that push the value `v` onto an Arg stack.
 *
 * wdata(arg,v) copies `v` into a temporary of the same type and hands its
 * address and size to the wdata() function defined earlier in this file
 * (a macro name is not re-expanded inside its own body, so the inner call
 * reaches the function).
 *
 * wint, wint16 and wint8 are plain aliases: the number of bytes pushed is
 * always sizeof(v), whatever the alias name suggests.
 *
 * __typeof__ is used instead of typeof so the macro also works in strict
 * ISO modes, and the body is wrapped in do{}while(0) so that it behaves as
 * a single statement inside if/else.
 */
#define wdata(arg,v)   do{ __typeof__(v) wdata_tmp_ = (v); wdata((arg),&wdata_tmp_,(int)sizeof(wdata_tmp_)); }while(0)
#define wint(arg,v)    wdata(arg,v)
#define wint16(arg,v)  wint(arg,v)
#define wint8(arg,v)   wint(arg,v)

#if 0

/*
 * Helper of the disabled assembly trampoline (inside #if 0).
 * Prints arg->size and the two pointers, then copies arg->size bytes
 * starting at arg->top to sp.  Returns memcpy's result, i.e. sp.
 * NOTE(review): arg->size is an int but is printed with "%p".
 */
static void* stackcpy(void*sp,struct Arg*arg){
    int* p = sp;
    printf("-----%p %p %p\n",arg->size,sp,arg);
    //printf("%d %d\n",p[0],p[1]);
    fflush(stdout);
    return memcpy(sp,arg->top,arg->size);
}
/*
 * Disabled experiment (inside #if 0): hand-written call trampoline.
 *
 * callA is only declared in C; the inline assembly inside call() emits the
 * label it is meant to resolve to.
 *
 * x86 sequence (AT&T syntax), as written:
 *   - load the two stack arguments: 8(%esp) into ebx, 4(%esp) into ecx
 *   - set up a frame and reserve 100 bytes of stack
 *   - push ecx, ebx and the address of the reserved area, call _stackcpy
 *   - drop two of the pushed words, pop the third into eax and call
 *     through it
 *   - leave / ret
 *
 * A second sequence for ARM is emitted when ARM32 is defined.
 * NOTE(review): the C function call() contains no return statement of its
 * own; the trailing #define redirects later uses of `call` to callA.
 */
static int callA(void*func,struct Arg*arg);
static void* call(void*func,struct Arg*arg){
    asm(
"_callA:     \n"
    "movl   8(%esp),%ebx             \n"
    "movl   4(%esp),%ecx             \n"
    
    "pushl	%ebp                  \n"
    "movl	%esp,%ebp             \n"
    
    "subl   $100,%esp             \n"
    "movl   %esp,%eax             \n"
    "pushl  %ecx                  \n"
    "pushl  %ebx                  \n"
    "pushl  %eax                  \n"
    "call   _stackcpy             \n"
    "addl   $8,%esp               \n"
    "popl   %eax                  \n"
    //"pushl  %esp                  \n"
    //"call   _mm                   \n"
    "call  *%eax                  \n"
    
	"leave                        \n"
    "ret                          \n"
    );
#if defined(ARM32)
    asm(
"callA:                           \n"
    "push {lr,r4}                 \n"
    "sub  sp,#0xf8                \n"
    "mov  r2,r0                   \n"
    "mov  r0,sp                   \n"
    "push {r2}                    \n"
    "bl   stackcpy                \n"
    "pop  {r4}                    \n"
    "pop  {r0,r1,r2,r3}           \n"  //arg1-4
    "bl   __v_call_r4             \n"
    "add  sp,#0x10                \n"
    "pop  r2                      \n"
    "bx   r2                      \n"
    );
#endif
}
#define call callA

#if 1
/* Test callee for the disabled trampoline experiment: prints its two
   arguments.
   NOTE(review): declared to return int but has no return statement. */
int mm(int x,int y){
    printf("=======%d %d\n",x,y);
}

/* Test driver for the disabled trampoline experiment: pushes 10 and 20,
   prints the addresses involved and the byte count, then invokes mm through
   callA.
   NOTE(review): only arg.size is reset here; arg.top is not set before the
   first push. */
int main(int argc,char**argv){
    arg.size = 0;
    wint(&arg,10);
    wint(&arg,20);
    printf("s = %p %p %d\n",&arg,&mm,arg.size);
    callA(&mm,&arg);
    return 0;
}
#endif


#else

/*
function def(mc,out)
    for c=1,mc,1 do
        local arg = "";
        local com = "";
        for idx = 0,c-1 do
            arg = arg .. com .. "arg[0 - " .. idx .. "]";
            com = ","
        end
        local s = "case %d: return f( " .. arg .. " ); break;"
        s = string.format(s,c);
        --print(s);
        out:write(s,"\n")
    end
end

*/


/*
 * Invoke `func` with the default calling convention (registered to Lua as
 * the cdecl entry point), passing the packed words of `a` as arguments.
 *
 * a->size is a byte count; dividing by sizeof(void*) gives the number of
 * pointer-sized words that were pushed.  The words are read upwards from
 * a->top, so the word at a->top becomes the first argument.  Between 0 and
 * 20 words are supported.
 *
 * Returns whatever `func` returns, read as a void*.  More than 20 words
 * trips assert(0); when assertions are compiled out (NDEBUG) the call is
 * skipped and NULL is returned instead of falling off the end of the
 * function.
 */
static void* call(void*func,struct Arg*a){
    void *(*f)() = func;
    void** s = (void**)(a->top);

    switch(a->size / (int)sizeof(void*)){
        case 0: return f();
        case 1: return f( s[0] );
        case 2: return f( s[0],s[1] );
        case 3: return f( s[0],s[1],s[2] );
        case 4: return f( s[0],s[1],s[2],s[3] );
        case 5: return f( s[0],s[1],s[2],s[3],s[4] );
        case 6: return f( s[0],s[1],s[2],s[3],s[4],s[5] );
        case 7: return f( s[0],s[1],s[2],s[3],s[4],s[5],s[6] );
        case 8: return f( s[0],s[1],s[2],s[3],s[4],s[5],s[6],s[7] );
        case 9: return f( s[0],s[1],s[2],s[3],s[4],s[5],s[6],s[7],s[8] );
        case 10: return f( s[0],s[1],s[2],s[3],s[4],s[5],s[6],s[7],s[8],s[9] );
        case 11: return f( s[0],s[1],s[2],s[3],s[4],s[5],s[6],s[7],s[8],s[9],s[10] );
        case 12: return f( s[0],s[1],s[2],s[3],s[4],s[5],s[6],s[7],s[8],s[9],s[10],s[11] );
        case 13: return f( s[0],s[1],s[2],s[3],s[4],s[5],s[6],s[7],s[8],s[9],s[10],s[11],s[12] );
        case 14: return f( s[0],s[1],s[2],s[3],s[4],s[5],s[6],s[7],s[8],s[9],s[10],s[11],s[12],s[13] );
        case 15: return f( s[0],s[1],s[2],s[3],s[4],s[5],s[6],s[7],s[8],s[9],s[10],s[11],s[12],s[13],s[14] );
        case 16: return f( s[0],s[1],s[2],s[3],s[4],s[5],s[6],s[7],s[8],s[9],s[10],s[11],s[12],s[13],s[14],s[15] );
        case 17: return f( s[0],s[1],s[2],s[3],s[4],s[5],s[6],s[7],s[8],s[9],s[10],s[11],s[12],s[13],s[14],s[15],s[16] );
        case 18: return f( s[0],s[1],s[2],s[3],s[4],s[5],s[6],s[7],s[8],s[9],s[10],s[11],s[12],s[13],s[14],s[15],s[16],s[17] );
        case 19: return f( s[0],s[1],s[2],s[3],s[4],s[5],s[6],s[7],s[8],s[9],s[10],s[11],s[12],s[13],s[14],s[15],s[16],s[17],s[18] );
        case 20: return f( s[0],s[1],s[2],s[3],s[4],s[5],s[6],s[7],s[8],s[9],s[10],s[11],s[12],s[13],s[14],s[15],s[16],s[17],s[18],s[19] );
        default:
            assert(0);
            break;
    }
    /* only reached when assert() is disabled */
    return 0;
}

#define _stdcall __attribute__((__stdcall__))

/*
 * Same as call(), but the target is invoked through a function pointer
 * carrying the stdcall attribute.
 *
 * a->size is a byte count; dividing by sizeof(void*) gives the number of
 * pointer-sized words that were pushed.  The word at a->top becomes the
 * first argument.  Between 0 and 20 words are supported.
 *
 * Returns whatever `func` returns, read as a void*.  More than 20 words
 * trips assert(0); when assertions are compiled out (NDEBUG) the call is
 * skipped and NULL is returned instead of falling off the end of the
 * function.
 */
static void* call_stdcall(void*func,struct Arg*a){
    _stdcall void *(*f)() = func;
    void** s = (void**)(a->top);

    switch(a->size / (int)sizeof(void*)){
        case 0: return f();
        case 1: return f( s[0] );
        case 2: return f( s[0],s[1] );
        case 3: return f( s[0],s[1],s[2] );
        case 4: return f( s[0],s[1],s[2],s[3] );
        case 5: return f( s[0],s[1],s[2],s[3],s[4] );
        case 6: return f( s[0],s[1],s[2],s[3],s[4],s[5] );
        case 7: return f( s[0],s[1],s[2],s[3],s[4],s[5],s[6] );
        case 8: return f( s[0],s[1],s[2],s[3],s[4],s[5],s[6],s[7] );
        case 9: return f( s[0],s[1],s[2],s[3],s[4],s[5],s[6],s[7],s[8] );
        case 10: return f( s[0],s[1],s[2],s[3],s[4],s[5],s[6],s[7],s[8],s[9] );
        case 11: return f( s[0],s[1],s[2],s[3],s[4],s[5],s[6],s[7],s[8],s[9],s[10] );
        case 12: return f( s[0],s[1],s[2],s[3],s[4],s[5],s[6],s[7],s[8],s[9],s[10],s[11] );
        case 13: return f( s[0],s[1],s[2],s[3],s[4],s[5],s[6],s[7],s[8],s[9],s[10],s[11],s[12] );
        case 14: return f( s[0],s[1],s[2],s[3],s[4],s[5],s[6],s[7],s[8],s[9],s[10],s[11],s[12],s[13] );
        case 15: return f( s[0],s[1],s[2],s[3],s[4],s[5],s[6],s[7],s[8],s[9],s[10],s[11],s[12],s[13],s[14] );
        case 16: return f( s[0],s[1],s[2],s[3],s[4],s[5],s[6],s[7],s[8],s[9],s[10],s[11],s[12],s[13],s[14],s[15] );
        case 17: return f( s[0],s[1],s[2],s[3],s[4],s[5],s[6],s[7],s[8],s[9],s[10],s[11],s[12],s[13],s[14],s[15],s[16] );
        case 18: return f( s[0],s[1],s[2],s[3],s[4],s[5],s[6],s[7],s[8],s[9],s[10],s[11],s[12],s[13],s[14],s[15],s[16],s[17] );
        case 19: return f( s[0],s[1],s[2],s[3],s[4],s[5],s[6],s[7],s[8],s[9],s[10],s[11],s[12],s[13],s[14],s[15],s[16],s[17],s[18] );
        case 20: return f( s[0],s[1],s[2],s[3],s[4],s[5],s[6],s[7],s[8],s[9],s[10],s[11],s[12],s[13],s[14],s[15],s[16],s[17],s[18],s[19] );
        default:
            assert(0);
            break;
    }
    /* only reached when assert() is disabled */
    return 0;
}
#endif

/*
 * Pack the Lua arguments above stack index `c` into the global `arg`
 * buffer, ready for call() / call_stdcall().
 *
 * Arguments are pushed from the last one down to index c+1; because the
 * buffer grows downwards, the first foreign argument ends up at arg.top.
 *
 * Conversions:
 *   nil                 -> NULL
 *   boolean             -> 0 / 1
 *   integer             -> pointer-sized integer (no longer truncated to int)
 *   non-integer number  -> float (CPU32) or double (CPU64), pushed through
 *                          wfloat / wdouble
 *   string              -> pointer to the string's bytes
 *   C function          -> its address; Lua function -> NULL
 *   light userdata      -> the stored pointer
 *   full userdata       -> the pointer read from the start of the block
 *   any other type      -> NULL (previously an uninitialised value)
 *
 * Raises the Lua error "too many arguments" when the buffer cannot take
 * another argument.  Returns &arg.
 */
static struct Arg* getArgs(lua_State*L,int c){
    int idx;

    arg.size = 0;
    arg.top = arg.parm + sizeof arg.parm;
    for(idx = lua_gettop(L); idx > c; idx--){
        void *v = 0;

        /* one argument occupies at most two pointer-sized words */
        if((size_t)(arg.top - arg.parm) < 2*sizeof(void*)){
            luaL_error(L,"too many arguments");
        }

        switch(lua_type(L,idx)){
            case LUA_TBOOLEAN:
                v = (void*)(intptr_t)lua_toboolean(L,idx);
                break;
            case LUA_TNUMBER:
                if(!lua_isinteger(L,idx)){
                    #if defined(CPU32)
                    float f = lua_tonumber(L,idx);
                    wfloat(&arg,f);
                    #elif defined(CPU64)
                    double d = lua_tonumber(L,idx);
                    wdouble(&arg,d);
                    #else
                    #error "CPU BIT ?"
                    #endif
                    continue;   /* already pushed; go to the next argument */
                }
                v = (void*)(intptr_t)lua_tointeger(L,idx);
                break;
            case LUA_TSTRING:
                v = (void*)lua_tostring(L,idx);
                break;
            case LUA_TFUNCTION:
                if(lua_iscfunction(L,idx)){
                    v = (void*)lua_tocfunction(L,idx);
                }
                break;
            case LUA_TLIGHTUSERDATA:
                v = lua_touserdata(L,idx);
                break;
            case LUA_TUSERDATA:
                v = *(void**)lua_touserdata(L,idx);
                break;
            default:
                /* nil, thread, table, ...: passed as NULL */
                break;
        }
        wptr(&arg,v);
    }
    return &arg;
}

/*
 * Lua: double(x)
 *
 * CPU32: pushes the two int-sized halves of x's in-memory representation,
 *        lower-addressed half first (2 results).  The bytes are copied with
 *        memcpy instead of being read through an int pointer, which
 *        violated the aliasing rules.
 * CPU64: pushes x converted to an integer (1 result).
 *        NOTE(review): this is a value conversion, not the bit pattern the
 *        CPU32 branch produces - confirm which one is intended.
 */
static int _double(lua_State*L){
    lua_Number v = luaL_checknumber(L,1);
#if defined(CPU32)
    int half[2] = {0,0};
    memcpy(half,&v,sizeof v < sizeof half ? sizeof v : sizeof half);
    lua_pushinteger(L,half[0]);
    lua_pushinteger(L,half[1]);
    return 2;
#else
    lua_pushinteger(L,(lua_Integer)v);
    return 1;
#endif
}

/*
 * Lua: stdcall(func, ...) -> integer
 * Argument 1 is turned into the target address by lcheck_toptr(); the
 * remaining arguments are packed by getArgs() and the target is invoked
 * through call_stdcall().  The returned pointer-sized value is pushed as
 * an integer.
 */
static int stdcall(lua_State*L){
    void *target = lcheck_toptr(L,1);
    struct Arg *packed = getArgs(L,1);
    void *result = call_stdcall(target,packed);

    lua_pushinteger(L,ltoint(result));
    return 1;
}

/*
 * Lua: dcall(func, ...) -> integer
 * Argument 1 is turned into the target address by lcheck_toptr(); the
 * remaining arguments are packed by getArgs() and the target is invoked
 * through call().  The returned pointer-sized value is pushed as an
 * integer.
 */
static int dcall(lua_State*L){
    void *target = lcheck_toptr(L,1);
    struct Arg *packed = getArgs(L,1);
    void *result = call(target,packed);

    lua_pushinteger(L,ltoint(result));
    return 1;
}

/*
 * Library entry point.
 * Installs everything as Lua globals: the address of printf as the light
 * userdata `printf`, plus the functions dcall, stdcall, double, dlopen,
 * dlclose and dlsym.  Leaves nothing on the stack (returns 0).
 */
int luaopen_dcall(lua_State*L){
    static const luaL_Reg globals[] = {
        {"dcall",   dcall},     /* cdecl */
        {"stdcall", stdcall},
        {"double",  _double},
        {"dlopen",  ldlopen},
        {"dlclose", ldlclose},
        {"dlsym",   ldlsym},
    };
    size_t i;

    lua_pushlightuserdata(L,(void*)printf);
    lua_setglobal(L,"printf");

    for(i = 0; i < sizeof globals / sizeof globals[0]; i++){
        lua_register(L,globals[i].name,globals[i].func);
    }
    return 0;
}
